Week 4: Clustering and classification

date() 
## [1] "Mon Nov 27 16:02:38 2023"

1) Data wrangling exercises

R-Script: https://github.com/venkatharina/IODS-project/blob/Data/create_human.R

2) Analysis exercises

library(MASS)
library(tidyr)
library(corrplot)
## corrplot 0.92 loaded
library(ggplot2)

#loading the data
data("Boston")

#exploring the dataset
dim(Boston) #506 rows and 14 variables
## [1] 506  14
str(Boston) #numeric and integer vectors
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
#Data describes methodological problems associated with the use of housing market data to measure the willingness to pay for clean air
#With the use of a hedonic housing price model and data for the Boston metropolitan area, quantitative estimates of the willingness to pay for air quality improvements are generated
#Marginal air pollution damages (as revealed in the housing market) are found to increase with the level of air pollution and with household income

#graphical overview and summary of the data:
#NOTE(review): plot() on a data.frame dispatches to the pairs() method, so the
#two calls below produce the same scatterplot matrix twice — one is redundant
plot(Boston)
pairs(Boston)

summary(Boston)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00
#correlation matrix of all 14 variables, visualised as circles
cor_matrix <- cor(Boston) 
corrplot(cor_matrix, method="circle")

#lots of variables; some pairs are nearly uncorrelated, others correlate strongly negatively/positively with each other
#scaling data (standardise each column: subtract its mean, divide by its sd):
boston_scaled <- as.data.frame(scale(Boston))
summary(boston_scaled)
##       crim                 zn               indus              chas        
##  Min.   :-0.419367   Min.   :-0.48724   Min.   :-1.5563   Min.   :-0.2723  
##  1st Qu.:-0.410563   1st Qu.:-0.48724   1st Qu.:-0.8668   1st Qu.:-0.2723  
##  Median :-0.390280   Median :-0.48724   Median :-0.2109   Median :-0.2723  
##  Mean   : 0.000000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.007389   3rd Qu.: 0.04872   3rd Qu.: 1.0150   3rd Qu.:-0.2723  
##  Max.   : 9.924110   Max.   : 3.80047   Max.   : 2.4202   Max.   : 3.6648  
##       nox                rm               age               dis         
##  Min.   :-1.4644   Min.   :-3.8764   Min.   :-2.3331   Min.   :-1.2658  
##  1st Qu.:-0.9121   1st Qu.:-0.5681   1st Qu.:-0.8366   1st Qu.:-0.8049  
##  Median :-0.1441   Median :-0.1084   Median : 0.3171   Median :-0.2790  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.5981   3rd Qu.: 0.4823   3rd Qu.: 0.9059   3rd Qu.: 0.6617  
##  Max.   : 2.7296   Max.   : 3.5515   Max.   : 1.1164   Max.   : 3.9566  
##       rad               tax             ptratio            black        
##  Min.   :-0.9819   Min.   :-1.3127   Min.   :-2.7047   Min.   :-3.9033  
##  1st Qu.:-0.6373   1st Qu.:-0.7668   1st Qu.:-0.4876   1st Qu.: 0.2049  
##  Median :-0.5225   Median :-0.4642   Median : 0.2746   Median : 0.3808  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 1.6596   3rd Qu.: 1.5294   3rd Qu.: 0.8058   3rd Qu.: 0.4332  
##  Max.   : 1.6596   Max.   : 1.7964   Max.   : 1.6372   Max.   : 0.4406  
##      lstat              medv        
##  Min.   :-1.5296   Min.   :-1.9063  
##  1st Qu.:-0.7986   1st Qu.:-0.5989  
##  Median :-0.1811   Median :-0.1449  
##  Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6024   3rd Qu.: 0.2683  
##  Max.   : 3.5453   Max.   : 2.9865
#all variables were standardised: each column now has mean 0 (and sd 1)

#`crim` (per capita crime rate by town)
#cut the variable to get the high, low and middle rates of crime into their own categories
#(no-op: scale() already returns numeric columns — kept for safety)
boston_scaled$crim <- as.numeric(boston_scaled$crim)
#using the quantiles as the break points (bins)
bins <- quantile(boston_scaled$crim)
bins
##           0%          25%          50%          75%         100% 
## -0.419366929 -0.410563278 -0.390280295  0.007389247  9.924109610
#creating categorical variable with 4 classes (quartiles of crime rate)
#NOTE(review): without a labels= argument cut() names the classes by their
#numeric ranges, which makes the later LDA output harder to read
crime <- cut(boston_scaled$crim, breaks =bins, include.lowest = TRUE)
#removing original crim from the dataset
boston_scaled <- dplyr::select(boston_scaled, -crim)
#adding the new categorical value to scaled data
boston_scaled <- data.frame(boston_scaled, crime)
#creating testing and training data (80/20 random split):
#FIX: set a seed so the random split — and everything downstream of it —
#is reproducible across knits
set.seed(123)
n <- nrow(Boston)
#sample 80% of the row indices for the training set
ind <- sample(n,  size = n * 0.8)
train <- boston_scaled[ind,]
test <- boston_scaled[-ind,]
#fitting linear discriminant analysis: crime class predicted by all other variables
lda.fit <- lda(crime ~ ., data=train)
lda.fit
## Call:
## lda(crime ~ ., data = train)
## 
## Prior probabilities of groups:
## [-0.419,-0.411]  (-0.411,-0.39] (-0.39,0.00739]  (0.00739,9.92] 
##       0.2500000       0.2574257       0.2500000       0.2425743 
## 
## Group means:
##                         zn      indus         chas        nox         rm
## [-0.419,-0.411]  1.0493320 -0.9304643 -0.155385496 -0.8878773  0.4706016
## (-0.411,-0.39]  -0.1353591 -0.2437711 -0.007331936 -0.5730003 -0.1256699
## (-0.39,0.00739] -0.3929954  0.1182795  0.234426408  0.3132186  0.1371383
## (0.00739,9.92]  -0.4872402  1.0171960 -0.071456607  1.0713166 -0.4152575
##                        age        dis        rad        tax      ptratio
## [-0.419,-0.411] -0.8993787  0.8765764 -0.6953230 -0.7140039 -0.476580685
## (-0.411,-0.39]  -0.3625265  0.3243112 -0.5500918 -0.4747678  0.008103357
## (-0.39,0.00739]  0.3891035 -0.3526371 -0.3928554 -0.3092396 -0.239225343
## (0.00739,9.92]   0.8330926 -0.8630831  1.6373367  1.5134896  0.779855168
##                       black       lstat        medv
## [-0.419,-0.411]  0.37068690 -0.78199177  0.55181657
## (-0.411,-0.39]   0.32217992 -0.14811224 -0.01590369
## (-0.39,0.00739]  0.09678494 -0.03410673  0.19494574
## (0.00739,9.92]  -0.85587610  0.86524580 -0.66349081
## 
## Coefficients of linear discriminants:
##                 LD1         LD2         LD3
## zn       0.07071560  0.86551072 -0.76408303
## indus    0.02786994 -0.39061897  0.53486577
## chas    -0.07947947 -0.13497302  0.05565763
## nox      0.45256883 -0.48908322 -1.40606522
## rm      -0.12443158 -0.02424534 -0.07767991
## age      0.23821036 -0.29926131 -0.33000640
## dis     -0.04982303 -0.32512804  0.16212076
## rad      3.20832668  0.67264314  0.29189832
## tax     -0.09653873  0.25276207  0.18181895
## ptratio  0.10087302  0.05322751 -0.14142084
## black   -0.13245317 -0.01033133  0.13599966
## lstat    0.18027739 -0.19713506  0.37187585
## medv     0.15809684 -0.28663775 -0.24576072
## 
## Proportion of trace:
##    LD1    LD2    LD3 
## 0.9463 0.0389 0.0148
#Draw variable-loading arrows with labels on top of an existing LDA biplot.
#x: a fitted lda object; myscale: multiplier for arrow length; arrow_heads: head
#size passed to arrows(); color: arrow/label colour; tex: label text size;
#choices: which two discriminants to use as the x/y axes.
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "red", tex = 0.75, choices = c(1, 2)){
  loadings <- coef(x)
  #arrow endpoints: scaled loadings on the two chosen discriminants
  tip_x <- myscale * loadings[, choices[1]]
  tip_y <- myscale * loadings[, choices[2]]
  graphics::arrows(x0 = 0, y0 = 0, x1 = tip_x, y1 = tip_y,
                   col = color, length = arrow_heads)
  #label each arrow tip with its variable name
  text(cbind(tip_x, tip_y), labels = row.names(loadings),
       cex = tex, col = color, pos = 3)
}
#numeric codes of the crime classes, used to colour the biplot points
classes <- as.numeric(train$crime)
#plotting lda results (biplot)
#FIX: `classes` was computed but never used — pass it as col/pch so each
#crime class gets its own colour and symbol
plot(lda.fit, dimen = 2, col = classes, pch = classes)
lda.arrows(lda.fit, myscale = 1)

#saving the crime categories from the test set (the ground truth)
correct_classes <- test$crime
#then removing the categorical crime variable from the test dataset
test <- dplyr::select(test, -crime)
# predict classes with test data
lda.pred <- predict(lda.fit, newdata = test)
# cross tabulate the results (rows = true class, columns = predicted class)
table(correct = correct_classes, predicted = lda.pred$class)
##                  predicted
## correct           [-0.419,-0.411] (-0.411,-0.39] (-0.39,0.00739] (0.00739,9.92]
##   [-0.419,-0.411]              12             14               0              0
##   (-0.411,-0.39]                3             14               5              0
##   (-0.39,0.00739]               0              5              19              1
##   (0.00739,9.92]                0              0               0             29
#predicted values seem correct only in class 4 (0.00739,9.92)
#mostly predictions seem okay-ish (majority in right class), but it could be better

#reloading the data (to undo the crime categorization / column removal above)
data("Boston")
#scaling data:
boston_scaled <- as.data.frame(scale(Boston))
summary(boston_scaled$dis)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.2658 -0.8049 -0.2790  0.0000  0.6617  3.9566
#calculating (euclidean) distances between the observations on the scaled data
distance <- dist(boston_scaled)
summary(distance)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1343  3.4625  4.8241  4.9111  6.1863 14.3970
#trying to identify the right cluster number with the elbow method
set.seed(140)
k_max <- 10
#total within-cluster sum of squares for k = 1..k_max
#FIX: run kmeans on the SCALED data — the original used the raw Boston data
#here, which is inconsistent with dist(boston_scaled) above and the final
#kmeans(boston_scaled, ...) call below
twcss <- sapply(seq_len(k_max), function(k) kmeans(boston_scaled, k)$tot.withinss)
#qplot() is deprecated since ggplot2 3.4.0 — build the line plot with ggplot()
ggplot(data.frame(k = seq_len(k_max), twcss = twcss), aes(x = k, y = twcss)) +
  geom_line()
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#right cluster number seems to be 6 for k-means clustering
#(kmeans starts from random centers; the seed was set above, so this is reproducible)
km <- kmeans(boston_scaled, centers = 6)
# plot the dataset with clusters, one colour per cluster assignment
pairs(boston_scaled, col = km$cluster)

############ BONUS ############

#reloading data:
data("Boston")
#scaling data:
boston_scaled <- as.data.frame(scale(Boston))
#k-means clustering with more than two clusters
#FIX: set a seed — kmeans starts from random centers, so without one the
#clusters (and the LDA fitted on them) change on every knit
set.seed(140)
km3 <- kmeans(boston_scaled, centers = 8)
boston_scaled$kmcluster <- as.numeric(km3$cluster)
#fitting LDA with the k-means cluster assignment as the target variable
lda.fit2 <- lda(kmcluster~., data=boston_scaled)
lda.fit2
## Call:
## lda(kmcluster ~ ., data = boston_scaled)
## 
## Prior probabilities of groups:
##          1          2          3          4          5          6          7 
## 0.21343874 0.04150198 0.21541502 0.06521739 0.10869565 0.15415020 0.10276680 
##          8 
## 0.09881423 
## 
## Group means:
##         crim           zn      indus        chas        nox          rm
## 1 -0.3768106 -0.425505051 -0.3701112  0.09221725 -0.2286148 -0.35000772
## 2  3.2082952 -0.487240187  1.0149946 -0.27232907  1.0998477 -1.45566793
## 3 -0.4060502 -0.005364112 -0.6583486 -0.27232907 -0.8405622 -0.04480167
## 4  1.1193656 -0.487240187  1.0149946 -0.27232907  0.9950574 -0.19450805
## 5 -0.4146327  2.524683754 -1.2040537 -0.20074543 -1.2034679  0.73835915
## 6  0.4620310 -0.487240187  1.0149946  0.13147608  1.0021383 -0.15800782
## 7 -0.2727474 -0.487240187  1.5613334  0.10623826  1.1631724 -0.63230595
## 8 -0.3681800 -0.053323566 -0.7442735  0.59383298 -0.2416605  1.68533550
##          age        dis        rad         tax     ptratio      black
## 1  0.3376923 -0.1286995 -0.5799077 -0.53244743  0.22283672  0.2729966
## 2  1.0064301 -1.0710604  1.6596029  1.52941294  0.80577843 -0.1691195
## 3 -1.0244892  0.8700429 -0.5720054 -0.70802741 -0.07989337  0.3722489
## 4  0.7406815 -0.8461740  1.6596029  1.52941294  0.80577843 -3.2850509
## 5 -1.4205771  1.6272606 -0.6832698 -0.53843478 -0.89403340  0.3540831
## 6  0.6920432 -0.7470446  1.6596029  1.52941294  0.80577843  0.1640227
## 7  0.9324774 -0.9083143 -0.6130366  0.03111252 -0.35520300 -0.1409865
## 8  0.1056916 -0.2903328 -0.4926242 -0.78414273 -1.08156698  0.3392477
##        lstat        medv
## 1  0.1641254 -0.25817616
## 2  2.0102765 -1.39479170
## 3 -0.5625097  0.09778119
## 4  1.1195132 -1.03254705
## 5 -0.9678176  0.83523455
## 6  0.3945960 -0.31539874
## 7  0.6799267 -0.51668841
## 8 -0.9695286  1.72241105
## 
## Coefficients of linear discriminants:
##                 LD1          LD2         LD3          LD4          LD5
## crim     0.19364384  0.126100236 -0.20191652 -0.376580103 -0.838531764
## zn      -0.05343615  1.094813039  1.50799254 -1.133131460  0.197678623
## indus    0.55399704 -1.280702654  1.13241037 -1.197258173  0.124603383
## chas    -0.03302284 -0.024772810 -0.14210875  0.080434989  0.122406528
## nox      0.05435103 -0.424948658  0.26397352 -0.771840326  0.285880375
## rm      -0.04813200  0.181941956  0.09971698  0.277384152  0.689446855
## age      0.14103637 -0.651476207 -0.09019746  0.008315478  0.502754473
## dis     -0.33219936  0.286780273  0.03476554 -0.281273725 -0.252480041
## rad      5.93891512  1.865585385 -1.41594980  0.744695993  0.427251550
## tax     -0.05791010  0.519204288  0.49438996 -0.580198607  0.087932891
## ptratio  0.21221433 -0.011384841  0.01954111  0.048612579 -0.116143664
## black   -0.19991944  0.269843607 -1.52326403 -1.543950260 -0.002263929
## lstat    0.07512642 -0.006494829  0.08239517  0.043407190 -0.209298121
## medv    -0.08095390  0.223086304 -0.03618324 -0.137450228  0.487212102
##                  LD6         LD7
## crim    -1.055793974  0.76410951
## zn      -0.472879553 -0.75684606
## indus    0.908040391  1.05671771
## chas    -0.139637868 -0.18984158
## nox      0.186010320  0.32881295
## rm      -0.077931268  0.31058189
## age     -0.573790467 -0.91403286
## dis      0.721069945  0.66003474
## rad      0.851480550  0.28819620
## tax     -0.245047274 -0.77133610
## ptratio -0.008931086 -0.50612180
## black    0.001693988 -0.02201946
## lstat   -0.741261471  0.23456713
## medv    -0.594939439  0.54523880
## 
## Proportion of trace:
##    LD1    LD2    LD3    LD4    LD5    LD6    LD7 
## 0.7632 0.1051 0.0506 0.0403 0.0205 0.0141 0.0062
#plotting the biplot of the k-means-based LDA (first two discriminants),
#with arrows showing which variables separate the clusters most
plot(lda.fit2, dimen = 2)
lda.arrows(lda.fit2, myscale = 1)

############ SUPERBONUS ############

#project the training observations onto the LDA discriminants of the crime model
model_predictors <- dplyr::select(train, -crime)
# check the dimensions (must be conformable: 404x13 %*% 13x3)
dim(model_predictors)
## [1] 404  13
dim(lda.fit$scaling)
## [1] 13  3
# matrix multiplication: each row becomes its (LD1, LD2, LD3) coordinates
matrix_product <- as.matrix(model_predictors) %*% lda.fit$scaling
matrix_product <- as.data.frame(matrix_product)
#Next, install and access the plotly package. Create a 3D plot (cool!) of the columns of the matrix product using the code below.
#install.packages("plotly")
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
#3D scatter of the projected observations, uncoloured
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers')
#same plot, coloured by the crime class of each training observation
#FIX: the original passed the string 'train$crime' (quoted), so plotly treated
#it as a single constant colour label — pass the factor itself instead
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3,
        type = 'scatter3d', mode = 'markers',
        color = train$crime, colors = c('red', 'green', 'blue', 'violet'))